import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import zscore
from scipy.spatial.distance import cdist
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import model_selection
from imblearn.over_sampling import SMOTE
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings; warnings.filterwarnings('ignore')
pd.options.display.max_rows = 4000
df1 = pd.read_csv('Part1 - Car name.csv')
df1.head(5)
| | car_name |
|---|---|
| 0 | chevrolet chevelle malibu |
| 1 | buick skylark 320 |
| 2 | plymouth satellite |
| 3 | amc rebel sst |
| 4 | ford torino |
df2 = pd.read_json('Part1 - Car-Attributes.json')
df2.head(5)
| | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
print('Shape of Part1 - Car name.csv: ', df1.shape)
print('Shape of Part1 - Car-Attributes.json ', df2.shape)
Shape of Part1 - Car name.csv:  (398, 1)
Shape of Part1 - Car-Attributes.json  (398, 8)
# df.info() prints its report directly and returns None, so we call it outside print()
print('Information of Part1 - Car name.csv:')
df1.info()
print('Information of Part1 - Car-Attributes.json:')
df2.info()
Information of Part1 - Car name.csv:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   car_name  398 non-null    object
dtypes: object(1)
memory usage: 3.2+ KB
Information of Part1 - Car-Attributes.json:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   mpg     398 non-null    float64
 1   cyl     398 non-null    int64
 2   disp    398 non-null    float64
 3   hp      398 non-null    object
 4   wt      398 non-null    int64
 5   acc     398 non-null    float64
 6   yr      398 non-null    int64
 7   origin  398 non-null    int64
dtypes: float64(3), int64(4), object(1)
memory usage: 25.0+ KB
dataset = pd.merge(df1,df2,left_index=True,right_index=True)
dataset.head()
| | car_name | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
dataset.shape
(398, 9)
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   car_name  398 non-null    object
 1   mpg       398 non-null    float64
 2   cyl       398 non-null    int64
 3   disp      398 non-null    float64
 4   hp        398 non-null    object
 5   wt        398 non-null    int64
 6   acc       398 non-null    float64
 7   yr        398 non-null    int64
 8   origin    398 non-null    int64
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB
# exporting dataset to CSV
dataset.to_csv('final_dataset.csv')
# exporting dataset to excel
dataset.to_excel('final_dataset.xlsx')
# exporting dataset to JSON
dataset.to_json('final_dataset.json')
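Note that to_csv and to_excel write the row index as an extra column by default, which is why an 'Unnamed: 0' column appears on re-import below. A minimal sketch of an index-free export (if re-run this way, the 'Unnamed: 0' cleanup later becomes unnecessary):
# passing index=False avoids the extra 'Unnamed: 0' column on re-import
dataset.to_csv('final_dataset.csv', index=False)
dataset.to_excel('final_dataset.xlsx', index=False)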
# importing final data which we created, for model creation and analysis.
dataset_final = pd.read_csv('final_dataset.csv')
dataset_final.head()
| | Unnamed: 0 | car_name | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | 1 | buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | 2 | plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | 3 | amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | 4 | ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
dataset_final.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Unnamed: 0  398 non-null    int64
 1   car_name    398 non-null    object
 2   mpg         398 non-null    float64
 3   cyl         398 non-null    int64
 4   disp        398 non-null    float64
 5   hp          398 non-null    object
 6   wt          398 non-null    int64
 7   acc         398 non-null    float64
 8   yr          398 non-null    int64
 9   origin      398 non-null    int64
dtypes: float64(3), int64(5), object(2)
memory usage: 31.2+ KB
dataset_final.drop(columns='Unnamed: 0', inplace=True)
dataset_final.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   car_name  398 non-null    object
 1   mpg       398 non-null    float64
 2   cyl       398 non-null    int64
 3   disp      398 non-null    float64
 4   hp        398 non-null    object
 5   wt        398 non-null    int64
 6   acc       398 non-null    float64
 7   yr        398 non-null    int64
 8   origin    398 non-null    int64
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB
dataset_final.tail(10)
| | car_name | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|---|---|---|---|---|---|---|---|---|
| 388 | chrysler lebaron medallion | 26.0 | 4 | 156.0 | 92 | 2585 | 14.5 | 82 | 1 |
| 389 | ford granada l | 22.0 | 6 | 232.0 | 112 | 2835 | 14.7 | 82 | 1 |
| 390 | toyota celica gt | 32.0 | 4 | 144.0 | 96 | 2665 | 13.9 | 82 | 3 |
| 391 | dodge charger 2.2 | 36.0 | 4 | 135.0 | 84 | 2370 | 13.0 | 82 | 1 |
| 392 | chevrolet camaro | 27.0 | 4 | 151.0 | 90 | 2950 | 17.3 | 82 | 1 |
| 393 | ford mustang gl | 27.0 | 4 | 140.0 | 86 | 2790 | 15.6 | 82 | 1 |
| 394 | vw pickup | 44.0 | 4 | 97.0 | 52 | 2130 | 24.6 | 82 | 2 |
| 395 | dodge rampage | 32.0 | 4 | 135.0 | 84 | 2295 | 11.6 | 82 | 1 |
| 396 | ford ranger | 28.0 | 4 | 120.0 | 79 | 2625 | 18.6 | 82 | 1 |
| 397 | chevy s-10 | 31.0 | 4 | 119.0 | 82 | 2720 | 19.4 | 82 | 1 |
dataset_final.hp.unique()
array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
'170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
'200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
'72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
'112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
'75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
'129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
'102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
'103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
'64', '74', '116', '82'], dtype=object)
# we can see that the column 'hp' contains placeholder '?' values.
dataset_final[dataset_final.hp == '?']
| | car_name | mpg | cyl | disp | hp | wt | acc | yr | origin |
|---|---|---|---|---|---|---|---|---|---|
| 32 | ford pinto | 25.0 | 4 | 98.0 | ? | 2046 | 19.0 | 71 | 1 |
| 126 | ford maverick | 21.0 | 6 | 200.0 | ? | 2875 | 17.0 | 74 | 1 |
| 330 | renault lecar deluxe | 40.9 | 4 | 85.0 | ? | 1835 | 17.3 | 80 | 2 |
| 336 | ford mustang cobra | 23.6 | 4 | 140.0 | ? | 2905 | 14.3 | 80 | 1 |
| 354 | renault 18i | 34.5 | 4 | 100.0 | ? | 2320 | 15.8 | 81 | 2 |
| 374 | amc concord dl | 23.0 | 4 | 151.0 | ? | 3035 | 20.5 | 82 | 1 |
# we'll drop these records from our dataset now.
# because hp is stored as strings, max() returns '?' (it sorts after the digit characters):
dataset_final.hp.max()
'?'
# deleting the records with '?' values from our dataset.
dataset_final.drop(dataset_final[dataset_final['hp'] == '?'].index, inplace=True)
dataset_final.shape
(392, 9)
dataset_final.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   car_name  392 non-null    object
 1   mpg       392 non-null    float64
 2   cyl       392 non-null    int64
 3   disp      392 non-null    float64
 4   hp        392 non-null    object
 5   wt        392 non-null    int64
 6   acc       392 non-null    float64
 7   yr        392 non-null    int64
 8   origin    392 non-null    int64
dtypes: float64(3), int64(4), object(2)
memory usage: 30.6+ KB
dataset_final.hp = dataset_final.hp.astype(float)
dataset_final.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   car_name  392 non-null    object
 1   mpg       392 non-null    float64
 2   cyl       392 non-null    int64
 3   disp      392 non-null    float64
 4   hp        392 non-null    float64
 5   wt        392 non-null    int64
 6   acc       392 non-null    float64
 7   yr        392 non-null    int64
 8   origin    392 non-null    int64
dtypes: float64(4), int64(4), object(1)
memory usage: 30.6+ KB
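As an alternative to dropping the six '?' rows, the placeholders could have been coerced to NaN and imputed with the median, keeping all 398 records. A sketch for reference (running it now is a no-op, since those rows are already gone):
# alternative (not applied here): coerce non-numeric values to NaN, then fill with the median
hp_numeric = pd.to_numeric(dataset_final['hp'], errors='coerce')
dataset_final['hp'] = hp_numeric.fillna(hp_numeric.median())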
# removing the car name from our dataset, since it is a free-text identifier.
dataset_final.drop(columns='car_name',inplace=True)
dataset_final_1 = dataset_final.copy()
dataset_final_1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   mpg     392 non-null    float64
 1   cyl     392 non-null    int64
 2   disp    392 non-null    float64
 3   hp      392 non-null    float64
 4   wt      392 non-null    int64
 5   acc     392 non-null    float64
 6   yr      392 non-null    int64
 7   origin  392 non-null    int64
dtypes: float64(4), int64(4)
memory usage: 27.6 KB
dataset_final_1.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| mpg | 392.0 | 23.445918 | 7.805007 | 9.0 | 17.000 | 22.75 | 29.000 | 46.6 |
| cyl | 392.0 | 5.471939 | 1.705783 | 3.0 | 4.000 | 4.00 | 8.000 | 8.0 |
| disp | 392.0 | 194.411990 | 104.644004 | 68.0 | 105.000 | 151.00 | 275.750 | 455.0 |
| hp | 392.0 | 104.469388 | 38.491160 | 46.0 | 75.000 | 93.50 | 126.000 | 230.0 |
| wt | 392.0 | 2977.584184 | 849.402560 | 1613.0 | 2225.250 | 2803.50 | 3614.750 | 5140.0 |
| acc | 392.0 | 15.541327 | 2.758864 | 8.0 | 13.775 | 15.50 | 17.025 | 24.8 |
| yr | 392.0 | 75.979592 | 3.683737 | 70.0 | 73.000 | 76.00 | 79.000 | 82.0 |
| origin | 392.0 | 1.576531 | 0.805518 | 1.0 | 1.000 | 1.00 | 2.000 | 3.0 |
# we'll start with the correlation between the columns, to understand their relationships:
correlation = dataset_final_1.corr()
plt.figure(figsize = (15, 10))
plt.xticks(fontsize = 15, rotation = 45)
plt.yticks(fontsize = 15)
plt.title('Correlation HeatMap', fontsize = 15)
sns.heatmap(correlation, annot = True, cmap = 'YlGnBu')
<AxesSubplot:title={'center':'Correlation HeatMap'}>
Observations:
sns.pairplot(dataset_final_1, diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x7fef988b88e0>
# we'll now start with Uni, Bi and multivariate analysis:
#1. CYL
#plt.figure(figsize = (7, 5))
sns.distplot(dataset_final_1.cyl, color='green').set_title('Density plot for CYL')
plt.show()
sns.boxplot(dataset_final_1.cyl, color='blue').set_title('Box plot for CYL')
plt.show()
#2. HP
sns.distplot(dataset_final_1.hp, color='green').set_title('Density plot for HP')
plt.show()
sns.boxplot(dataset_final_1.hp, color='blue').set_title('Box plot for HP')
plt.show()
#3. MPG
sns.distplot(dataset_final_1.mpg, color='green').set_title('Density plot for MPG')
plt.show()
sns.boxplot(dataset_final_1.mpg, color='blue').set_title('Box plot for MPG')
plt.show()
#4. DISP
sns.distplot(dataset_final_1.disp, color='green').set_title('Density plot for DISPLACEMENT')
plt.show()
sns.boxplot(dataset_final_1.disp, color='blue').set_title('Box plot for DISPLACEMENT')
plt.show()
#5. Acceleration
sns.distplot(dataset_final_1.acc, color='green').set_title('Density plot for Acceleration')
plt.show()
sns.boxplot(dataset_final_1.acc, color='blue').set_title('Box plot for Acceleration')
plt.show()
# we'll fix the outliers present for HP and ACC.
# replacing outliers using the IQR rule (capping values beyond Q1/Q3 +- 1.5*IQR)
IQR1 = stats.iqr(dataset_final_1['hp'], interpolation = 'midpoint')
IQR2 = stats.iqr(dataset_final_1['acc'], interpolation = 'midpoint')
# capping HP at its upper fence (198.5):
Q3 = dataset_final_1['hp'].quantile(0.75)
dataset_final_1['hp'] = np.where(dataset_final_1["hp"] >(Q3+1.5*IQR1), 198.5,dataset_final_1['hp'])
sns.boxplot(dataset_final_1['hp']);
Q1 = dataset_final_1['acc'].quantile(0.25)
Q3 = dataset_final_1['acc'].quantile(0.75)
print(Q1, Q3)
dataset_final_1['acc'] = np.where(dataset_final_1["acc"] >(Q3+1.5*IQR2),22.10,dataset_final_1['acc'])
dataset_final_1['acc'] = np.where(dataset_final_1["acc"] <(Q1-1.5*IQR2),(Q1-1.5*IQR2),dataset_final_1['acc'])
sns.boxplot(dataset_final_1['acc']);
13.775000000000002 17.025
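The same capping logic can be wrapped into a reusable helper; a minimal sketch (the cap_outliers_iqr function below is illustrative, not part of the original analysis):
# reusable IQR capping: clip a column to [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
def cap_outliers_iqr(frame, col):
    q1, q3 = frame[col].quantile(0.25), frame[col].quantile(0.75)
    iqr = q3 - q1
    frame[col] = frame[col].clip(lower=q1 - 1.5*iqr, upper=q3 + 1.5*iqr)
    return frame
# usage: cap_outliers_iqr(dataset_final_1, 'hp')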
Observations:
# we'll start with K-means clustering technique:
dataset_final_1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   mpg     392 non-null    float64
 1   cyl     392 non-null    int64
 2   disp    392 non-null    float64
 3   hp      392 non-null    float64
 4   wt      392 non-null    int64
 5   acc     392 non-null    float64
 6   yr      392 non-null    int64
 7   origin  392 non-null    int64
dtypes: float64(4), int64(4)
memory usage: 27.6 KB
# 'origin' is a categorical column, so we keep only the 7 numeric columns
data = dataset_final_1.iloc[:,0:7]
data_scaled = data.apply(zscore)
data_scaled.head()
| | mpg | cyl | disp | hp | wt | acc | yr |
|---|---|---|---|---|---|---|---|
| 0 | -0.698638 | 1.483947 | 1.077290 | 0.704168 | 0.620540 | -1.312137 | -1.625315 |
| 1 | -1.083498 | 1.483947 | 1.488732 | 1.650322 | 0.843334 | -1.498326 | -1.625315 |
| 2 | -0.698638 | 1.483947 | 1.182542 | 1.244827 | 0.540382 | -1.684515 | -1.625315 |
| 3 | -0.955212 | 1.483947 | 1.048584 | 1.244827 | 0.536845 | -1.312137 | -1.625315 |
| 4 | -0.826925 | 1.483947 | 1.029447 | 0.974498 | 0.555706 | -1.870705 | -1.625315 |
# now, for K-means clustering, we'll try to find the best value for 'K' using the elbow method.
clusters=range(1,10)
meanDistortions=[]
for k in clusters:
model=KMeans(n_clusters=k)
model.fit(data_scaled)
prediction=model.predict(data_scaled)
meanDistortions.append(sum(np.min(cdist(data_scaled, model.cluster_centers_, 'euclidean'), axis=1)) /
data_scaled.shape[0])
plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
Text(0.5, 1.0, 'Selecting k with the Elbow Method')
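The elbow can be cross-checked with silhouette scores (higher is better); a minimal sketch, assuming the still-unlabelled data_scaled frame from above (k=1 cannot be scored, so the loop starts at 2):
# silhouette scores for k = 2..9; the best k maximises the score
for k in range(2, 10):
    labels_k = KMeans(n_clusters=k, random_state=1).fit_predict(data_scaled)
    print(k, round(silhouette_score(data_scaled, labels_k), 3))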
final_model=KMeans(2)
final_model.fit(data_scaled)
prediction=final_model.predict(data_scaled)
#Appending the predictions to the dataset
data["Pred"] = prediction
data_scaled["Pred"] = prediction
print("Groups Assigned : \n")
data.sample(5)
Groups Assigned :
| | mpg | cyl | disp | hp | wt | acc | yr | Pred |
|---|---|---|---|---|---|---|---|---|
| 390 | 32.0 | 4 | 144.0 | 96.0 | 2665 | 13.9 | 82 | 1 |
| 217 | 30.0 | 4 | 111.0 | 80.0 | 2155 | 14.8 | 77 | 1 |
| 87 | 13.0 | 8 | 350.0 | 145.0 | 3988 | 13.0 | 73 | 0 |
| 383 | 38.0 | 4 | 91.0 | 67.0 | 1965 | 15.0 | 82 | 1 |
| 37 | 18.0 | 6 | 232.0 | 100.0 | 3288 | 15.5 | 71 | 1 |
data.groupby('Pred').count()
| Pred | mpg | cyl | disp | hp | wt | acc | yr |
|---|---|---|---|---|---|---|---|
| 0 | 100 | 100 | 100 | 100 | 100 | 100 | 100 |
| 1 | 292 | 292 | 292 | 292 | 292 | 292 | 292 |
data_scaled.groupby('Pred').mean().T
| Pred | 0 | 1 |
|---|---|---|
| mpg | -1.120060 | 0.383582 |
| cyl | 1.472207 | -0.504181 |
| disp | 1.453998 | -0.497945 |
| hp | 1.475148 | -0.505188 |
| wt | 1.354828 | -0.463982 |
| acc | -1.038159 | 0.355534 |
| yr | -0.606026 | 0.207543 |
# we'll now plot the data points coloured by cluster
plt.figure(figsize=(10, 8))
sns.scatterplot(x="mpg", y="hp", hue="Pred",
data=data_scaled,
palette=['green','blue']);
Observations:
data_scaled.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   mpg     392 non-null    float64
 1   cyl     392 non-null    float64
 2   disp    392 non-null    float64
 3   hp      392 non-null    float64
 4   wt      392 non-null    float64
 5   acc     392 non-null    float64
 6   yr      392 non-null    float64
 7   Pred    392 non-null    int32
dtypes: float64(7), int32(1)
memory usage: 26.0 KB
# now, we'll perform hierarchical clustering:
# again, we take the numeric columns and standardise them.
data_1 = dataset_final_1.iloc[:,0:7]
data_scaled_1 = data_1.apply(zscore)
data_scaled_1.head()
| | mpg | cyl | disp | hp | wt | acc | yr |
|---|---|---|---|---|---|---|---|
| 0 | -0.698638 | 1.483947 | 1.077290 | 0.704168 | 0.620540 | -1.312137 | -1.625315 |
| 1 | -1.083498 | 1.483947 | 1.488732 | 1.650322 | 0.843334 | -1.498326 | -1.625315 |
| 2 | -0.698638 | 1.483947 | 1.182542 | 1.244827 | 0.540382 | -1.684515 | -1.625315 |
| 3 | -0.955212 | 1.483947 | 1.048584 | 1.244827 | 0.536845 | -1.312137 | -1.625315 |
| 4 | -0.826925 | 1.483947 | 1.029447 | 0.974498 | 0.555706 | -1.870705 | -1.625315 |
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist
# the closer the cophenetic correlation is to 1, the better the clustering preserves the original pairwise distances:
Z1 = linkage(data_scaled_1, metric='euclidean', method='centroid')
c1, coph_dists = cophenet(Z1 , pdist(data_scaled_1))
c1
0.7867138610584432
Z2 = linkage(data_scaled_1, metric='euclidean', method='complete')
c2, coph_dists = cophenet(Z2 , pdist(data_scaled_1))
c2
0.7624426859347365
Z3 = linkage(data_scaled_1, metric='euclidean', method='median')
c3, coph_dists = cophenet(Z3 , pdist(data_scaled_1))
c3
0.6842613728425571
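The three comparisons above can be condensed into a loop over linkage methods; a sketch that also tries 'average' and 'ward', which were not tested in the original:
# compare cophenetic correlations across linkage methods (closer to 1 is better)
dists = pdist(data_scaled_1)
for method in ['centroid', 'complete', 'median', 'average', 'ward']:
    Z = linkage(data_scaled_1, metric='euclidean', method=method)
    c, _ = cophenet(Z, dists)
    print(method, round(c, 4))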
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
dendrogram(Z1, leaf_rotation=90.,color_threshold=600, leaf_font_size=10, )
plt.tight_layout()
# truncated dendrogram, showing only the last p=2 merged clusters
dendrogram(
Z1,
truncate_mode='lastp',
p=2,
)
plt.show()
#adding the new clusters formed to the dataset
from scipy.cluster.hierarchy import fcluster
clusters = fcluster(Z1, 2, criterion='maxclust')
data_scaled_1['clusters_H'] = clusters
data_scaled_1.head()
| | mpg | cyl | disp | hp | wt | acc | yr | clusters_H |
|---|---|---|---|---|---|---|---|---|
| 0 | -0.698638 | 1.483947 | 1.077290 | 0.704168 | 0.620540 | -1.312137 | -1.625315 | 1 |
| 1 | -1.083498 | 1.483947 | 1.488732 | 1.650322 | 0.843334 | -1.498326 | -1.625315 | 1 |
| 2 | -0.698638 | 1.483947 | 1.182542 | 1.244827 | 0.540382 | -1.684515 | -1.625315 | 1 |
| 3 | -0.955212 | 1.483947 | 1.048584 | 1.244827 | 0.536845 | -1.312137 | -1.625315 | 1 |
| 4 | -0.826925 | 1.483947 | 1.029447 | 0.974498 | 0.555706 | -1.870705 | -1.625315 | 1 |
#plotting the clusters formed
plt.figure(figsize=(10, 8))
sns.scatterplot(x="mpg", y="hp", hue="clusters_H",
data=data_scaled_1,
palette=['purple','green']);
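As a quick sanity check, the K-means labels ('Pred') and the hierarchical labels ('clusters_H') can be cross-tabulated; a sketch, assuming both frames still share the original row index:
# agreement between the two clusterings; off-diagonal counts are disagreements (up to label naming)
pd.crosstab(data_scaled['Pred'], data_scaled_1['clusters_H'])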
# Answer 1 = The optimal number of clusters that can be derived for this dataset is 2.
# we made this conclusion based on the elbow method above.
# we'll now fit a Linear Regression model on the dataset:
#1. Using the Original dataset:
# independent variables
X = dataset_final_1.drop(['mpg','origin'], axis=1)
# dependent variable
y = dataset_final_1[['mpg']]
# Split X and y into training and test set in 70:30 ratio and fitting the LR model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=20)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
LinearRegression()
regression_model.score(X_train, y_train)
0.8020479678803313
regression_model.score(X_test, y_test)
0.8251059038684684
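R² alone hides the error magnitude, so the test RMSE in mpg units is worth reporting too; a minimal sketch (mean_squared_error is an extra import not used in the original):
# RMSE on the held-out test set, in the units of the target (mpg)
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, regression_model.predict(X_test)))
print('Test RMSE (mpg):', rmse)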
Observation:
With LR on our ORIGINAL dataset, we got a score of 80% on the training data and 82.5% on the test data.
Now, we'll perform LR on the cluster-augmented data (using the hierarchical cluster labels).
# Linear regression on the data augmented with the cluster labels (clusters_H from the hierarchical model)
dataset_final_2 = dataset_final_1.copy()
dataset_final_2['clusters_H'] = data_scaled_1['clusters_H']
dataset_final_2['clusters_H']=dataset_final_2['clusters_H'].astype('category')
# fcluster produced labels 1 and 2, so the mapping must cover both (a {1, 0} map would leave label 2 untouched)
dataset_final_2['clusters_H'] = dataset_final_2['clusters_H'].replace({1: 'Type 1', 2: 'Type 2'})
dataset_final_2.head()
| | mpg | cyl | disp | hp | wt | acc | yr | origin | clusters_H |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | Type 1 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | Type 1 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | Type 1 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | Type 1 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | Type 1 |
dataset_final_2 = pd.get_dummies(dataset_final_2, columns=['clusters_H'])
dataset_final_2.head()
| | mpg | cyl | disp | hp | wt | acc | yr | origin | clusters_H_Type 1 | clusters_H_Type 2 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 | 1 | 0 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 | 1 | 0 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 | 1 | 0 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 | 1 | 0 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 | 1 | 0 |
# again dividing the data, now with the cluster dummies as predictors:
X = dataset_final_2.drop(['mpg'], axis=1)
# the dependent variable
y = dataset_final_2[['mpg']]
# re-splitting so the model actually trains on the cluster-augmented features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=20)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
regression_model.score(X_train, y_train)
0.8014949133197267
regression_model.score(X_test, y_test)
0.8251059038684684
Observations:
df = pd.read_excel('Part2 - Company.xlsx')
df.head()
| | A | B | C | D | Quality |
|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | Quality A |
| 1 | 174 | 133 | 134 | 166 | Quality B |
| 2 | 159 | 163 | 135 | 131 | NaN |
| 3 | 61 | 23 | 3 | 44 | Quality A |
| 4 | 59 | 60 | 9 | 68 | Quality A |
df.shape
(61, 5)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   A        61 non-null     int64
 1   B        61 non-null     int64
 2   C        61 non-null     int64
 3   D        61 non-null     int64
 4   Quality  43 non-null     object
dtypes: int64(4), object(1)
memory usage: 2.5+ KB
# we can see that the Categorical field 'Quality' has NULL values:
df.Quality.isnull().value_counts()
False    43
True     18
Name: Quality, dtype: int64
df.Quality.value_counts()
Quality A    26
Quality B    17
Name: Quality, dtype: int64
sns.pairplot(df, diag_kind='kde', hue='Quality')
<seaborn.axisgrid.PairGrid at 0x7fef834e2dc0>
# now we'll build a K-means clustering model to identify the missing Quality labels.
# we'll perform z-score standardization on the dataset
df1 = df.drop(['Quality'], axis = 1)
df2 = df1.apply(zscore)
df2.head()
| | A | B | C | D |
|---|---|---|---|---|
| 0 | -1.168034 | -1.561080 | -1.061569 | -0.103138 |
| 1 | 0.904992 | 0.284923 | 0.306077 | 0.823013 |
| 2 | 0.660147 | 0.807376 | 0.321443 | 0.264129 |
| 3 | -0.939512 | -1.630740 | -1.706975 | -1.125099 |
| 4 | -0.972158 | -0.986381 | -1.614775 | -0.741864 |
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       61 non-null     float64
 1   B       61 non-null     float64
 2   C       61 non-null     float64
 3   D       61 non-null     float64
dtypes: float64(4)
memory usage: 2.0 KB
#applying kmeans with 2 centroids
k_means = KMeans(n_clusters = 2)
k_means.fit(df2)
labels = k_means.labels_
# Calculating the silhouette score
silhouette_score(df2,labels)
0.6891674125195145
# we'll append the newly created labels against the old data
labels
array([0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1], dtype=int32)
df['new_label'] = labels
df.head()
| | A | B | C | D | Quality | new_label |
|---|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | Quality A | 0 |
| 1 | 174 | 133 | 134 | 166 | Quality B | 1 |
| 2 | 159 | 163 | 135 | 131 | NaN | 1 |
| 3 | 61 | 23 | 3 | 44 | Quality A | 0 |
| 4 | 59 | 60 | 9 | 68 | Quality A | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   A          61 non-null     int64
 1   B          61 non-null     int64
 2   C          61 non-null     int64
 3   D          61 non-null     int64
 4   Quality    43 non-null     object
 5   new_label  61 non-null     int32
dtypes: int32(1), int64(4), object(1)
memory usage: 2.8+ KB
# map the numeric cluster labels to the Quality names so we can compare them
df.new_label = df.new_label.replace({0:'Quality A', 1:"Quality B"})
df.head()
| | A | B | C | D | Quality | new_label |
|---|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | Quality A | Quality A |
| 1 | 174 | 133 | 134 | 166 | Quality B | Quality B |
| 2 | 159 | 163 | 135 | 131 | NaN | Quality B |
| 3 | 61 | 23 | 3 | 44 | Quality A | Quality A |
| 4 | 59 | 60 | 9 | 68 | Quality A | Quality A |
# let's have a quick comparison between the new predicted labels and the old labels:
df3 = df.dropna()
a = df3.Quality
b = df3.new_label
cm = confusion_matrix(a, b)
cm
array([[26, 0],
[ 0, 17]])
DOMAIN: Automobile
CONTEXT: The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
DATA DESCRIPTION: The data contains features extracted from the silhouettes of vehicles photographed at different angles. Four "Corgie" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, van and either one of the cars would be readily distinguishable, but it would be more difficult to distinguish between the two cars.
All the features are numeric i.e. geometric features extracted from the silhouette.
PROJECT OBJECTIVE: Apply dimensionality reduction technique – PCA and train a model using principal components instead of training the model using just the raw data.
# 1. Data import and checks:
data = pd.read_csv("Part3 - vehicle.csv")
data.head()
| | compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   compactness                  846 non-null    int64
 1   circularity                  841 non-null    float64
 2   distance_circularity         842 non-null    float64
 3   radius_ratio                 840 non-null    float64
 4   pr.axis_aspect_ratio         844 non-null    float64
 5   max.length_aspect_ratio      846 non-null    int64
 6   scatter_ratio                845 non-null    float64
 7   elongatedness                845 non-null    float64
 8   pr.axis_rectangularity       843 non-null    float64
 9   max.length_rectangularity    846 non-null    int64
 10  scaled_variance              843 non-null    float64
 11  scaled_variance.1            844 non-null    float64
 12  scaled_radius_of_gyration    844 non-null    float64
 13  scaled_radius_of_gyration.1  842 non-null    float64
 14  skewness_about               840 non-null    float64
 15  skewness_about.1             845 non-null    float64
 16  skewness_about.2             845 non-null    float64
 17  hollows_ratio                846 non-null    int64
 18  class                        846 non-null    object
dtypes: float64(14), int64(4), object(1)
memory usage: 125.7+ KB
# we can see some null values present in the dataset.
data.isnull().sum()
compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64
# we'll be replacing the missing values with their respective median values
for cols in data.columns:
if(cols != 'class'):
data[cols] = data[cols].fillna(data[cols].median())
data.isnull().sum()
compactness                    0
circularity                    0
distance_circularity           0
radius_ratio                   0
pr.axis_aspect_ratio           0
max.length_aspect_ratio        0
scatter_ratio                  0
elongatedness                  0
pr.axis_rectangularity         0
max.length_rectangularity      0
scaled_variance                0
scaled_variance.1              0
scaled_radius_of_gyration      0
scaled_radius_of_gyration.1    0
skewness_about                 0
skewness_about.1               0
skewness_about.2               0
hollows_ratio                  0
class                          0
dtype: int64
# now our dataset is ready to be worked upon
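The per-column loop above is equivalent to a single SimpleImputer call (already imported at the top); a sketch of the one-step alternative, shown for reference (re-running it now is a no-op):
# median imputation for all numeric columns in one step
num_cols = data.columns.drop('class')
data[num_cols] = SimpleImputer(strategy='median').fit_transform(data[num_cols])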
# 2. EDA and statistical analysis of the dataset.
data.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| compactness | 846.0 | 93.678487 | 8.234474 | 73.0 | 87.00 | 93.0 | 100.00 | 119.0 |
| circularity | 846.0 | 44.823877 | 6.134272 | 33.0 | 40.00 | 44.0 | 49.00 | 59.0 |
| distance_circularity | 846.0 | 82.100473 | 15.741569 | 40.0 | 70.00 | 80.0 | 98.00 | 112.0 |
| radius_ratio | 846.0 | 168.874704 | 33.401356 | 104.0 | 141.00 | 167.0 | 195.00 | 333.0 |
| pr.axis_aspect_ratio | 846.0 | 61.677305 | 7.882188 | 47.0 | 57.00 | 61.0 | 65.00 | 138.0 |
| max.length_aspect_ratio | 846.0 | 8.567376 | 4.601217 | 2.0 | 7.00 | 8.0 | 10.00 | 55.0 |
| scatter_ratio | 846.0 | 168.887707 | 33.197710 | 112.0 | 147.00 | 157.0 | 198.00 | 265.0 |
| elongatedness | 846.0 | 40.936170 | 7.811882 | 26.0 | 33.00 | 43.0 | 46.00 | 61.0 |
| pr.axis_rectangularity | 846.0 | 20.580378 | 2.588558 | 17.0 | 19.00 | 20.0 | 23.00 | 29.0 |
| max.length_rectangularity | 846.0 | 147.998818 | 14.515652 | 118.0 | 137.00 | 146.0 | 159.00 | 188.0 |
| scaled_variance | 846.0 | 188.596927 | 31.360427 | 130.0 | 167.00 | 179.0 | 217.00 | 320.0 |
| scaled_variance.1 | 846.0 | 439.314421 | 176.496341 | 184.0 | 318.25 | 363.5 | 586.75 | 1018.0 |
| scaled_radius_of_gyration | 846.0 | 174.706856 | 32.546277 | 109.0 | 149.00 | 173.5 | 198.00 | 268.0 |
| scaled_radius_of_gyration.1 | 846.0 | 72.443262 | 7.468734 | 59.0 | 67.00 | 71.5 | 75.00 | 135.0 |
| skewness_about | 846.0 | 6.361702 | 4.903244 | 0.0 | 2.00 | 6.0 | 9.00 | 22.0 |
| skewness_about.1 | 846.0 | 12.600473 | 8.930962 | 0.0 | 5.00 | 11.0 | 19.00 | 41.0 |
| skewness_about.2 | 846.0 | 188.918440 | 6.152247 | 176.0 | 184.00 | 188.0 | 193.00 | 206.0 |
| hollows_ratio | 846.0 | 195.632388 | 7.438797 | 181.0 | 190.25 | 197.0 | 201.00 | 211.0 |
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x7f8b46bf9df0>
# well, that took some time :D.
# from the graphs, we see some positive and some negative relations between columns.
# we'll start only with numerical data first:
plt.figure(figsize=(20, 18))
col = 1
for i in data.drop(columns='class').columns:
plt.subplot(4, 5, col)
sns.distplot(data[i], color = 'b')
col += 1
Observations:
#Now, we'll be checking the distribution of the Object class-
print(data['class'].value_counts())
plt.title('Count of Vehicle Class column')
sns.countplot(x = 'class', data = data);
car    429
bus    218
van    199
Name: class, dtype: int64
# we can see that the classes are not equally balanced among the 3 groups.
# before modelling, balancing the classes would help the model produce its best results.
# to check for outliers, we'll draw the boxplots:
plt.figure(figsize=(25,23))
col = 1
for i in data.drop(columns='class').columns:
plt.subplot(6, 4, col)
sns.boxplot(data[i],color='blue')
col += 1
# as expected, the columns with high right skew in the density plots show more outliers.
# we'll treat the outliers using the IQR rule, replacing them with the column median.
for col_name in data.drop(columns = 'class').columns:
q1 = data[col_name].quantile(0.25)
q3 = data[col_name].quantile(0.75)
iqr = q3 - q1
low = q1 - 1.5 * iqr
high = q3 + 1.5 * iqr
data.loc[(data[col_name] < low) | (data[col_name] > high), col_name] = data[col_name].median()
# re-drawing the boxplots to check the outliers again:
plt.figure(figsize=(25,23))
col = 1
for i in data.drop(columns='class').columns:
plt.subplot(6, 4, col)
sns.boxplot(data[i],color='blue')
col += 1
# the outlier treatment worked well :)
# now, we will check the correlation between the columns using corr() and a heatmap.
# this helps us understand the dependencies between columns and identify which ones carry redundant information.
plt.figure(figsize=(20,18))
corr=data.corr()
sns.heatmap(corr,annot=True,cmap = 'YlGnBu' );
# to check the distribution of dataset with target class:
# 1st, we'll split the dataset-
X = data.loc[:, data.columns != 'class']
y = data['class'].astype('category').cat.codes
# 2nd, we'll plot each feature's correlation with the class variable:
plt.figure(figsize = (15, 8))
ax = sns.barplot(x = X.columns, y = X.corrwith(y))
plt.title('Correlation with Class column', fontsize = 20)
x = plt.setp(ax.get_xticklabels(), rotation=90)
# Now, we have 18 dimensions in the dataset, and as seen from the correlation plot,
# not all of them contribute useful, independent information.
# Hence, we'll perform PCA to reduce the dimensions and build a better model.
# 1st, scaling is needed:
XScaled=X.apply(zscore)
XScaled.head()
| | compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.160580 | 0.518073 | 0.057177 | 0.300945 | 1.933135 | 0.912212 | -0.207598 | 0.136262 | -0.224342 | 0.758332 | -0.400771 | -0.337407 | 0.285705 | -0.315806 | -0.032330 | 0.387162 | -0.312012 | 0.183957 |
| 1 | -0.325470 | -0.623732 | 0.120741 | -0.850666 | -0.740596 | 0.427456 | -0.599423 | 0.520519 | -0.610886 | -0.344578 | -0.594220 | -0.618623 | -0.513630 | 0.009122 | 0.624090 | 0.161740 | 0.013265 | 0.452977 |
| 2 | 1.254193 | 0.844303 | 1.519141 | 1.265808 | 0.863642 | 0.912212 | 1.148719 | -1.144597 | 0.935290 | 0.689401 | 1.114582 | 1.131806 | 1.392477 | 0.171586 | 1.718123 | -0.401818 | -0.149374 | 0.049447 |
| 3 | -0.082445 | -0.623732 | -0.006386 | -0.290423 | 0.328896 | 0.427456 | -0.750125 | 0.648605 | -0.610886 | -0.344578 | -0.916635 | -0.739145 | -1.466683 | -1.453054 | -0.032330 | -0.289106 | 1.639649 | 1.529056 |
| 4 | -1.054545 | -0.134387 | -0.769150 | 1.141310 | -0.027601 | -0.057300 | -0.599423 | 0.520519 | -0.610886 | -0.275646 | 1.694930 | -0.647319 | 0.408680 | -0.072110 | 0.624090 | -0.176395 | -1.450481 | -1.699181 |
# to choose the number of components, we'll plot the cumulative explained variance before reducing.
pca = PCA()
X_pca_ = pca.fit_transform(XScaled)
plt.figure(figsize = (12, 8))
plt.plot((np.cumsum(pca.explained_variance_ratio_) * 100), marker = 'X')
plt.xlim(0, 18)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance in %');
pca.explained_variance_
array([9.74940269e+00, 3.35071912e+00, 1.19238155e+00, 1.13381916e+00,
8.83997312e-01, 6.66265745e-01, 3.18150910e-01, 2.28179142e-01,
1.31018595e-01, 7.98619108e-02, 7.33979478e-02, 6.46162669e-02,
4.01448646e-02, 3.22758478e-02, 2.93936408e-02, 2.27005257e-02,
1.98136761e-02, 5.16287320e-03])
# from the explained variances above, we can deduce that the first 6 or so components carry most of the information,
# while the rest contribute little.
# to make a more informed decision, we'll plot the cumulative EXPLAINED VARIANCE as a step plot:
plt.figure(figsize = (12, 8))
plt.step(list(range(18)), (np.cumsum(pca.explained_variance_ratio_) * 100), where = 'mid')
plt.xlim(0, 18)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative EXPLAINED Variance in % ')
Text(0, 0.5, 'Cumulative EXPLAINED Variance in % ')
# we can see that almost 90% of the variance can be explained by 5 components, and 95% by 6.
# beyond that, the curve flattens out.
pca_n = PCA(n_components=6)
pca_n.fit(XScaled)
print('PCA components:\n',pca_n.components_)
print('-----------------------------------------------------------')
print('PCA explained components:\n',pca_n.explained_variance_ratio_)
Xpca_n = pca_n.transform(XScaled)
PCA components:
 [[ 0.27250289  0.28725469  0.30242111  0.26971354  0.09786073  0.19520014
    0.31052393 -0.3090069   0.307287    0.27815416  0.29976509  0.30553237
    0.26323762 -0.04193594  0.03608321  0.05872048  0.03801314  0.08474   ]
  [-0.08704358  0.13162176 -0.04614301 -0.19793126 -0.25783995 -0.10804563
    0.07528535 -0.01322994  0.0875602   0.12215424  0.07726575  0.07150302
    0.21058205  0.50362158 -0.01576632 -0.09274624 -0.50162122 -0.50761211]
  [-0.03818521 -0.20114691  0.06346211  0.05628517 -0.06199275 -0.14895782
    0.10904283 -0.09085269  0.1060705  -0.21368469  0.1445998   0.11034374
   -0.20287019  0.07386402 -0.55917399  0.6706805  -0.06224071 -0.04170535]
  [ 0.13867501 -0.03805548  0.10895429 -0.25435509 -0.61276572  0.27867816
    0.00539295  0.06521486  0.03089915  0.04146747 -0.06400509 -0.00219687
   -0.08553965 -0.11539962  0.47370331  0.42842603 -0.0274096   0.09603749]
  [ 0.13710147 -0.13899555 -0.08001743  0.13374437  0.12360146 -0.63489336
    0.08555745 -0.07907344  0.08164638 -0.25111294  0.14747123  0.11010098
   -0.00521211  0.1380686   0.56655224  0.13086982  0.18051929 -0.11078807]
  [ 0.26361138 -0.07134741 -0.01690062 -0.13818366 -0.57782861 -0.28909699
    0.09774711 -0.07572829  0.10540323 -0.07819622  0.13291241  0.11539822
   -0.0670574  -0.13151308 -0.31917609 -0.46840497  0.28013644  0.05944441]]
-----------------------------------------------------------
PCA explained components:
 [0.54099325 0.18593103 0.06616512 0.0629155  0.04905291 0.03697101]
pca_6 = PCA(n_components = 6)
X_pca = pca_6.fit_transform(XScaled)
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_pca.shape[1])
Original number of features: 18
Reduced number of features: 6
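sklearn can also choose the component count directly from a variance target, rather than reading it off the plot; a minimal sketch:
# passing a float keeps the smallest number of components explaining >= 95% of the variance
pca_95 = PCA(n_components=0.95)
X_pca_95 = pca_95.fit_transform(XScaled)
print('Components for 95% variance:', pca_95.n_components_)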
# let's review some of the new PCA components:
pca_df = pd.DataFrame(data = X_pca)
pca_df.head(7)
| | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| 0 | 0.584228 | -0.675673 | -0.453334 | -0.750656 | -0.777515 | -1.848809 |
| 1 | -1.512180 | -0.348934 | -0.333436 | 1.268953 | -0.324929 | -0.118317 |
| 2 | 3.913448 | 0.234507 | -1.265094 | 0.137224 | 0.915751 | -0.685594 |
| 3 | -1.535193 | -3.044413 | -0.469623 | 0.324317 | -0.611590 | 0.367777 |
| 4 | -0.642062 | 1.488882 | -0.246288 | -0.550939 | 0.471655 | -1.012697 |
| 5 | 5.382325 | 4.658285 | 1.173395 | 0.242066 | 1.587378 | 2.508100 |
| 6 | -0.813699 | -2.220045 | -1.962160 | -0.410052 | 1.522932 | 0.655823 |
# splitting the original dataset.
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size = 0.3, random_state = 10)
# shapes of the train and test data:
print('train shape:',X_train.shape)
print('test shape:',X_test.shape)
train shape: (592, 18)
test shape: (254, 18)
# now, we'll be splitting the PCA dataset.
X_tr, X_te, y_tr, y_te = train_test_split(X_pca, y, test_size = 0.3, random_state = 10)
# shapes of the train and test PCA data:
print('PCA train shape:',X_tr.shape)
print('PCA test shape:',X_te.shape)
PCA train shape: (592, 6)
PCA test shape: (254, 6)
## Model creation:
# Support Vector Machine on train data
svc_model = SVC(C= 4, kernel='rbf', gamma='scale')
svc_model.fit(X_train, y_train)
SVC(C=4)
#predicting on train data
from sklearn import metrics
sv_train_predict = svc_model.predict(X_train)
print('Train accuracy:',metrics.accuracy_score(y_train, sv_train_predict))
Train accuracy: 0.9898648648648649
sv_test_predict = svc_model.predict(X_test)
print('Test accuracy:',metrics.accuracy_score(y_test, sv_test_predict))
Test accuracy: 0.968503937007874
# to view and visualize the results, we'll draw the confusion matrix as a heatmap:
cm= confusion_matrix(y_test, sv_test_predict)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'YlGnBu', fmt = 'd')
plt.xlabel('Predicted Classes', fontsize = 15)
plt.ylabel('Actual Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
print("Classification Report")
print(metrics.classification_report(y_test, sv_test_predict, labels=[0,1,2]))
Classification Report
precision recall f1-score support
0 1.00 0.99 0.99 71
1 0.98 0.96 0.97 125
2 0.90 0.97 0.93 58
accuracy 0.97 254
macro avg 0.96 0.97 0.97 254
weighted avg 0.97 0.97 0.97 254
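The C=4 setting above was fixed by hand; a hedged sketch of tuning C and gamma with cross-validation instead (GridSearchCV is an extra import, and the grid values are illustrative):
from sklearn.model_selection import GridSearchCV
# small illustrative grid; a real search would cover a wider range
param_grid = {'C': [0.1, 1, 4, 10], 'gamma': ['scale', 0.01, 0.1]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(X_train, y_train)
print('Best params:', grid.best_params_, 'CV accuracy:', round(grid.best_score_, 3))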
# Creating the SVC model on the PCA data and predicting on the PCA train set
svc_model_pca = SVC(C= 4, kernel='rbf', gamma='scale')
svc_model_pca.fit(X_tr, y_tr)
sv_tr_predict = svc_model_pca.predict(X_tr)
print('Train accuracy for PCA: ',metrics.accuracy_score(y_tr, sv_tr_predict))
Train accuracy for PCA: 0.9476351351351351
sv_te_predict = svc_model_pca.predict(X_te)
print('Test accuracy on PCA data: ',metrics.accuracy_score(y_te, sv_te_predict))
Test accuracy on PCA data: 0.9212598425196851
cm = confusion_matrix(y_te, sv_te_predict)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Predicted Classes', fontsize = 15)
plt.ylabel('Actual Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
print("Classification Report")
print(metrics.classification_report(y_te, sv_te_predict, labels=[0,1,2]))
Classification Report
precision recall f1-score support
0 0.96 0.94 0.95 71
1 0.94 0.91 0.93 125
2 0.84 0.91 0.88 58
accuracy 0.92 254
macro avg 0.91 0.92 0.92 254
weighted avg 0.92 0.92 0.92 254
Observations:
END of PART 3.
df = pd.read_csv('Part4 - batting_bowling_ipl_bat.csv')
df.head()
| | Name | Runs | Ave | SR | Fours | Sixes | HF |
|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Name    90 non-null     object
 1   Runs    90 non-null     float64
 2   Ave     90 non-null     float64
 3   SR      90 non-null     float64
 4   Fours   90 non-null     float64
 5   Sixes   90 non-null     float64
 6   HF      90 non-null     float64
dtypes: float64(6), object(1)
memory usage: 10.0+ KB
df.shape
(180, 7)
df = df.dropna()
df = df.reset_index(drop=True)
print(df.head())
print(df.shape)
        Name   Runs    Ave      SR  Fours  Sixes   HF
0   CH Gayle  733.0  61.08  160.74   46.0   59.0  9.0
1  G Gambhir  590.0  36.87  143.55   64.0   17.0  6.0
2   V Sehwag  495.0  33.00  161.23   57.0   19.0  5.0
3   CL White  479.0  43.54  149.68   41.0   20.0  5.0
4   S Dhawan  569.0  40.64  129.61   58.0   18.0  5.0
(90, 7)
# Now, we'll perform the EDA and Statistical analysis:
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x7fef845f3cd0>
df.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Runs | 90.0 | 219.933333 | 156.253669 | 2.00 | 98.000 | 196.500 | 330.7500 | 733.00 |
| Ave | 90.0 | 24.729889 | 13.619215 | 0.50 | 14.665 | 24.440 | 32.1950 | 81.33 |
| SR | 90.0 | 119.164111 | 23.656547 | 18.18 | 108.745 | 120.135 | 131.9975 | 164.10 |
| Fours | 90.0 | 19.788889 | 16.399845 | 0.00 | 6.250 | 16.000 | 28.0000 | 73.00 |
| Sixes | 90.0 | 7.577778 | 8.001373 | 0.00 | 3.000 | 6.000 | 10.0000 | 59.00 |
| HF | 90.0 | 1.188889 | 1.688656 | 0.00 | 0.000 | 0.500 | 2.0000 | 9.00 |
- We can see that there's a positive relationship between some of the columns from dataset.
- Dataset was cleaned and prepared for the analysis.
- Before modelling, we have to standardize the dataset.
plt.figure(figsize=(15,8))
var=df.groupby('Name')['Runs'].sum().sort_values(ascending = False ).head(10)
var= var.reset_index()
var.columns = ['Name' ,'Runs']
sns.barplot(data= var , x= 'Name' , y ='Runs');
plt.title("---Name of top 10 Players according to the 'Runs'----");
plt.figure(figsize=(15,8))
var=df.groupby('Name')['Fours'].sum().sort_values(ascending = False ).head(10)
var= var.reset_index()
var.columns = ['Name' ,'Fours']
sns.barplot(data= var , x= 'Name' , y ='Fours');
plt.title("---Name of top 10 Players with maximum number of 4's'----");
plt.figure(figsize=(15,8))
var=df.groupby('Name')['Sixes'].sum().sort_values(ascending = False ).head(10)
var= var.reset_index()
var.columns = ['Name' ,'Sixes']
sns.barplot(data= var , x= 'Name' , y ='Sixes');
plt.title("---Name of top 10 Players with maximum number of 6's'----");
plt.figure(figsize=(15,8))
var=df.groupby('Name')['SR'].sum().sort_values(ascending = False ).head(10)
var= var.reset_index()
var.columns = ['Name' ,'SR']
sns.barplot(data= var , x= 'Name' , y ='SR');
plt.title("---Name of top 10 Players with maximum Strike Rate'----");
plt.figure(figsize=(15,8))
var=df.groupby('Name')['Ave'].sum().sort_values(ascending = False ).head(10)
var= var.reset_index()
var.columns = ['Name' ,'Ave']
sns.barplot(data= var , x= 'Name' , y ='Ave');
plt.title("---Name of top 10 Players with maximum average'----");
plt.figure(figsize=(15,8))
var=df.groupby('Name')['HF'].sum().sort_values(ascending = False ).head(10)
var= var.reset_index()
var.columns = ['Name' ,'HF']
sns.barplot(data= var , x= 'Name' , y ='HF');
plt.title("---Name of top 10 Players with maximum number of 4's'----");
plt.figure(figsize=(15, 12))
col = 1
for i in df.drop(columns='Name').columns:
plt.subplot(2, 3, col)
sns.distplot(df[i], color = 'b')
col += 1
#checking for correlation
plt.figure(figsize=(10,8))
corr=df.drop(columns='Name').corr()
sns.heatmap(corr,annot=True, cmap = 'YlGnBu');
Observations:
# we'll scale the data and perform the modelling task now:
df4 = df.iloc[:,1:7]
df41 = df4.apply(zscore)
df41.head()
| | Runs | Ave | SR | Fours | Sixes | HF |
|---|---|---|---|---|---|---|
| 0 | 3.301945 | 2.683984 | 1.767325 | 1.607207 | 6.462679 | 4.651551 |
| 1 | 2.381639 | 0.896390 | 1.036605 | 2.710928 | 1.184173 | 2.865038 |
| 2 | 1.770248 | 0.610640 | 1.788154 | 2.281703 | 1.435530 | 2.269533 |
| 3 | 1.667276 | 1.388883 | 1.297182 | 1.300618 | 1.561209 | 2.269533 |
| 4 | 2.246490 | 1.174755 | 0.444038 | 2.343021 | 1.309851 | 2.269533 |
# plotting to check for the optimal number of clusters
clusters=range(1,10)
meanDistortions=[]
for k in clusters:
model=KMeans(n_clusters=k)
model.fit(df41)
prediction=model.predict(df41)
meanDistortions.append(sum(np.min(cdist(df41, model.cluster_centers_, 'euclidean'), axis=1)) /
df41.shape[0])
plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
Text(0.5, 1.0, 'Selecting k with the Elbow Method')
# The elbow here is 2.
k_means = KMeans(n_clusters = 2)
k_means.fit(df41)
labels = k_means.labels_
# Calculating silhouette_score
silhouette_score(df41,labels)
0.41111085574076756
# merging the predicted labels into the dataset
df['cluster']=labels
df.head()
| | Name | Runs | Ave | SR | Fours | Sixes | HF | cluster |
|---|---|---|---|---|---|---|---|---|
| 0 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 | 0 |
| 1 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 | 0 |
| 2 | V Sehwag | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 | 0 |
| 3 | CL White | 479.0 | 43.54 | 149.68 | 41.0 | 20.0 | 5.0 | 0 |
| 4 | S Dhawan | 569.0 | 40.64 | 129.61 | 58.0 | 18.0 | 5.0 | 0 |
df.cluster.value_counts().sort_index()
0    35
1    55
Name: cluster, dtype: int64
df['cluster'] = df['cluster'].replace({1: 'Grade B', 0: 'Grade A'})
Grade_A = df[df['cluster'] == 'Grade A']
Grade_A.head(10)
| | Name | Runs | Ave | SR | Fours | Sixes | HF | cluster |
|---|---|---|---|---|---|---|---|---|
| 0 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 | Grade A |
| 1 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 | Grade A |
| 2 | V Sehwag | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 | Grade A |
| 3 | CL White | 479.0 | 43.54 | 149.68 | 41.0 | 20.0 | 5.0 | Grade A |
| 4 | S Dhawan | 569.0 | 40.64 | 129.61 | 58.0 | 18.0 | 5.0 | Grade A |
| 5 | AM Rahane | 560.0 | 40.00 | 129.33 | 73.0 | 10.0 | 5.0 | Grade A |
| 6 | KP Pietersen | 305.0 | 61.00 | 147.34 | 22.0 | 20.0 | 3.0 | Grade A |
| 7 | RG Sharma | 433.0 | 30.92 | 126.60 | 39.0 | 18.0 | 5.0 | Grade A |
| 8 | AB de Villiers | 319.0 | 39.87 | 161.11 | 26.0 | 15.0 | 3.0 | Grade A |
| 9 | JP Duminy | 244.0 | 81.33 | 128.42 | 13.0 | 11.0 | 2.0 | Grade A |
Grade_B = df[df['cluster'] == 'Grade B']
Grade_B.head(10)
| | Name | Runs | Ave | SR | Fours | Sixes | HF | cluster |
|---|---|---|---|---|---|---|---|---|
| 34 | MS Bisla | 213.0 | 30.42 | 133.12 | 16.0 | 10.0 | 1.0 | Grade B |
| 36 | BJ Hodge | 245.0 | 30.62 | 140.00 | 18.0 | 9.0 | 0.0 | Grade B |
| 37 | NV Ojha | 255.0 | 23.18 | 113.83 | 21.0 | 13.0 | 1.0 | Grade B |
| 38 | DB Das | 126.0 | 42.00 | 135.48 | 9.0 | 6.0 | 0.0 | Grade B |
| 39 | AC Gilchrist | 172.0 | 34.40 | 120.27 | 21.0 | 4.0 | 1.0 | Grade B |
| 40 | BB McCullum | 289.0 | 24.08 | 102.12 | 37.0 | 3.0 | 1.0 | Grade B |
| 41 | IK Pathan | 176.0 | 25.14 | 139.68 | 14.0 | 6.0 | 0.0 | Grade B |
| 42 | Azhar Mahmood | 186.0 | 23.25 | 130.98 | 16.0 | 8.0 | 0.0 | Grade B |
| 43 | MK Pandey | 143.0 | 20.42 | 127.67 | 12.0 | 6.0 | 1.0 | Grade B |
| 44 | S Badrinath | 196.0 | 28.00 | 108.28 | 23.0 | 2.0 | 1.0 | Grade B |
plt.figure(figsize=(15, 12))
sns.scatterplot(x='Runs', y = 'Fours',data = df, hue='cluster')
<AxesSubplot:xlabel='Runs', ylabel='Fours'>
df.groupby('cluster').mean().T
| cluster | Grade A | Grade B |
|---|---|---|
| Runs | 372.657143 | 122.745455 |
| Ave | 35.962286 | 17.582000 |
| SR | 132.808286 | 110.481455 |
| Fours | 34.685714 | 10.309091 |
| Sixes | 13.771429 | 3.636364 |
| HF | 2.714286 | 0.218182 |
Observations:
Questions: [ Total Score: 5 points]
Random Forest: feature importances from a trained random forest can be used to keep only the most informative features.
Missing Value Ratio: columns whose fraction of missing values exceeds a chosen threshold are dropped.
High Correlation filter: one feature out of each highly correlated pair is removed, since the pair carries largely redundant information.
Low Variance filter: features whose variance falls below a threshold are dropped, as they barely discriminate between records (see the sketch after this list).
Principal Component Analysis: the data is projected onto orthogonal components ordered by explained variance, and only the leading components are kept.
Factor Analysis: observed variables are modelled as linear combinations of a smaller number of latent factors plus noise.
Independent Component Analysis: the data is decomposed into statistically independent (not merely uncorrelated) components.
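To make the low variance filter concrete, a sketch with sklearn's VarianceThreshold, reusing the unscaled vehicle feature frame X from Part 3 purely to show the API (the 0.1 threshold is illustrative; variance thresholds are scale-dependent, so on z-scored data every feature would trivially survive):
# drop features whose variance falls below the threshold
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.1)
X_reduced = selector.fit_transform(X)
print('Kept', X_reduced.shape[1], 'of', X.shape[1], 'features')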
# importing the digits dataset from sklearn's built-in datasets
from sklearn.datasets import load_digits
df = load_digits()
df.images.shape
(1797, 8, 8)
# the dataset is a 3-dimensional array: 1797 images, each an 8x8 pixel grid.
# to get a feel for the data, we'll plot the first 100 digits with matplotlib:
fig, axes = plt.subplots(10, 10, figsize=(8, 8))
for i,ax in enumerate(axes.flat):
ax.imshow(df.images[i], cmap='binary', interpolation='nearest')
ax.text(0.05, 0.05, str(df.target[i]),
transform=ax.transAxes, color='green')
# we can see the digit images; the true digit value is written in green in the corner of each cell.
data = df.data
print(data.shape)
(1797, 64)
target = df.target
print(target.shape)
(1797,)
# we have now divided the dataset into DATA and TARGET variables.
# data holds the pixel values as a 2-D array (1797 rows x 64 features).
# target holds the true digit for each row.
# transform the data to two dimensions using a manifold learning algorithm (Isomap)
from sklearn.manifold import Isomap
iso = Isomap(n_components=2)
iso.fit(df.data)
data_projected = iso.transform(df.data)
data_projected.shape
(1797, 2)
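The 2-D embedding computed above is worth visualising; a minimal sketch colouring each projected point by its true digit:
# scatter of the Isomap embedding, coloured by the true digit label
plt.figure(figsize=(10, 8))
plt.scatter(data_projected[:, 0], data_projected[:, 1], c=df.target, cmap='tab10', s=10)
plt.colorbar(label='digit')
plt.title('Isomap projection of the digits data');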
# we'll now split the data:
X = data
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)
# using NB model:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)
pred = model.predict(X_test)
accuracy_score(y_test, pred)
0.8407407407407408
# checking the model's prediction:
plt.figure(figsize = (12, 8))
mat = confusion_matrix(y_test, pred)
sns.heatmap(mat, square=True, annot=True, cbar=False)
plt.xlabel('predicted value')
plt.ylabel('Actual value');
plt.title('Confusion Matrix HeatMap', fontsize = 15);
fig, axes = plt.subplots(10, 10, figsize=(8, 8),
                         subplot_kw={'xticks':[], 'yticks':[]},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
# note: pred and y_test follow the shuffled test split, so we reshape the matching
# X_test rows back to 8x8 instead of indexing df.images (which follows the original order)
for i, ax in enumerate(axes.flat):
    ax.imshow(X_test[i].reshape(8, 8), cmap='binary', interpolation='nearest')
    ax.text(0.05, 0.05, str(pred[i]),
            transform=ax.transAxes,
            color='green' if (y_test[i] == pred[i]) else 'red')
Observations: